import pandas as pd
import pickle
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets
import seaborn as sns
import scipy
import math
# You can configure the format of the images: ‘png’, ‘retina’, ‘jpeg’, ‘svg’, ‘pdf’.
%config InlineBackend.figure_format = 'svg'
# this statement allows the visuals to render within your Jupyter Notebook
%matplotlib inline
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 25)
pd.set_option('display.precision', 3)
import psycopg2 as pg
import pandas.io.sql as pd_sql
import warnings
warnings.filterwarnings('ignore')
connection_args = {
'host': 'localhost',
'dbname': 'cps',
'port': 5432
}
conn = pg.connect(**connection_args)
hs_records_1617 = ('''SELECT COUNT(school_id) FROM allschools_1617
WHERE is_high_school = 'Y'
''')
cursor = conn.cursor()
cursor.execute(hs_records_1617)
print("There are %s high schools in the 2016/17 dataset" %cursor.fetchall()[0][0])
hs_gr_null_1617 = ('''SELECT COUNT(school_id) FROM allschools_1617
WHERE is_high_school = 'Y'
AND graduation_rate_school IS NULL
''')
cursor = conn.cursor()
cursor.execute(hs_gr_null_1617)
print("There are %s high schools records in the 2016/17 dataset with null graduation rates"
%cursor.fetchall()[0][0])
hs_gr_notnull_1617 = ('''SELECT COUNT(school_id) FROM allschools_1617
WHERE is_high_school = 'Y'
AND graduation_rate_school IS NOT NULL
''')
cursor = conn.cursor()
cursor.execute(hs_gr_notnull_1617)
print("There are %s high schools records in the 2016/17 dataset with non-null graduation rates" %cursor.fetchall()[0][0])
Now let's look at 2017/18
hs_records_1718 = ('''SELECT COUNT(school_id) FROM allschools_1718
WHERE is_high_school = 'Y'
''')
cursor = conn.cursor()
cursor.execute(hs_records_1718)
print("There are %s high schools in the 2017/18 dataset" %cursor.fetchall()[0][0])
hs_gr_null_1718 = ('''SELECT COUNT(school_id) FROM allschools_1718
WHERE is_high_school = 'Y'
AND graduation_rate_school IS NULL
''')
cursor = conn.cursor()
cursor.execute(hs_gr_null_1718)
print("There are %s high schools records in the 2017/18 dataset with null graduation rates"
%cursor.fetchall()[0][0])
hs_gr_notnull_1718 = ('''SELECT COUNT(school_id) FROM allschools_1718
WHERE is_high_school = 'Y'
AND graduation_rate_school IS NOT NULL
''')
cursor = conn.cursor()
cursor.execute(hs_gr_notnull_1718)
print("There are %s high schools records in the 2017/18 dataset with non-null graduation rates" %cursor.fetchall()[0][0])
Next, let's look at the variance of graduation rates across both years.
# first, create lists of the graduation rates across both years.
hs_gr_notnull_1617 = ('''SELECT graduation_rate_school FROM allschools_1617
WHERE is_high_school = 'Y'
AND graduation_rate_school IS NOT NULL
''')
cursor = conn.cursor()
cursor.execute(hs_gr_notnull_1617)
grad_rates_1617 = [rate[0] for rate in cursor.fetchall()]
var_1617 = gradrate_var(grad_rates_1617)
hs_gr_notnull_1718 = ('''SELECT graduation_rate_school FROM allschools_1718
WHERE is_high_school = 'Y'
AND graduation_rate_school IS NOT NULL
''')
cursor = conn.cursor()
cursor.execute(hs_gr_notnull_1718)
grad_rates_1718 = [rate[0] for rate in cursor.fetchall()]
#Then, let's define a class that takes an input of a list of graduation
#rates, and computes the mean, variance, and std. for variance.
class Gradrates:
def __init__(self, rate_list):
self.rate_list = rate_list
self.mean = sum(self.rate_list)/len(rate_list)
def variance(self):
sum_of_squared_totals = 0
for grad_rate in self.rate_list:
sum_of_squared_totals += (grad_rate - self.mean)**2
grad_rate_var = sum_of_squared_totals/len(self.rate_list)
return grad_rate_var
def std(self):
stand_dev = math.sqrt(self.variance())
return(stand_dev)
grad_1617 = Gradrates(grad_rates_1617)
grad_1718 = Gradrates(grad_rates_1718)
print("Mean graduation rate 2016/17: %s" %grad_1617.mean)
print("Mean graduation rate 2017/18: %s" %grad_1718.mean)
print("Variance 2016/17: %s" %grad_1617.variance())
print("Variance 2017/18: %s" %grad_1718.variance())
There is more variance in 17/18 graduation rates.
There are some outliers in the data. Let's look at the values which fall outside of two standard deviations.
outliers_1617 = [x for x in grad_rates_1617 if
(abs(x - grad_1617.mean) > (grad_1617.std() * 2))]
outliers_1718 = [x for x in grad_rates_1718 if
(abs(x - grad_1718.mean) > (grad_1718.std() * 2))]
print("Outiers 2016/17: %s" %outliers_1617)
print("Outiers 2017/18: %s" %outliers_1718)
Let's find those schools in our database.
cursor = conn.cursor()
outliers_1617_qry = """SELECT school_id, short_name, graduation_rate_school FROM allschools_1617
WHERE graduation_rate_school < 28"""
cursor.execute(outliers_1617_qry)
cursor.fetchall()
cursor = conn.cursor()
outliers_1718_qry = """SELECT school_id, short_name, graduation_rate_school FROM allschools_1718
WHERE graduation_rate_school < 28"""
cursor.execute(outliers_1718_qry)
cursor.fetchall()
With some quick google searches, one can intuit the reason for these low graduations rates of these schools. Northside Learning HS is a school for students with " significant intellectual disabilities and impaired adaptive functioning"; Vaughn provides "a specialized education for high school students with cognitive, developmental and multiple disabilities"; York is a high school run out of Cook County Jail; Jefferson teaches students at Cook County Juvenile Detention Center; Simpson educates women who are pregrant or teen mothers. Pathways is part of "a national network of non-profit public schools and learning programs that partner with local school districts to reengage at-risk youth." Noble causes all, but including the graduation rates in this study is unfair and would throw off the analysis.
cursor = conn.cursor()
pathways = """SELECT school_id, short_name, graduation_rate_school FROM allschools_1718
WHERE school_id = 610557"""
cursor.execute(pathways)
cursor.fetchall()
hs_gr_1617 = '''SELECT graduation_rate_school FROM allschools_1617
WHERE is_high_school = 'Y'
AND graduation_rate_school IS NOT NULL'''
cursor.execute(hs_gr_1617)
gr_1617 = [gr[0] for gr in cursor.fetchall()]
sns.distplot(gr_1617, color='b')
plt.xlim(0,100)
plt.xlabel("Percentage of Students Graduating")
plt.title('CPS Graduation Rates 16/17')
hs_gr_1718 = '''SELECT graduation_rate_school FROM allschools_1718
WHERE is_high_school = 'Y'
AND graduation_rate_school IS NOT NULL'''
cursor.execute(hs_gr_1718)
gr_1718 = [gr[0] for gr in cursor.fetchall()]
sns.distplot(gr_1718, color='orange')
plt.xlim(0, 100)
plt.xlabel("Percentage of Students Graduating")
plt.title('CPS Graduation Rates 17/18')
After superimposing the two histograms, we see increase of grad rates in 2017/18 in comparison to 2016/17
hist_plt = [gr_1617, gr_1718]
color = ['blue', 'orange']
year = ['2016/17', '2017/18']
for a,c, y in zip(hist_plt, color, year):
sns.distplot(a, color=c, label=y)
plt.xlim(0,100)
plt.xlabel("Percentage of Students Graduating")
plt.legend()
plt.title("HS Grad Rates 16/17 and 17/18")
print("Skew for 2016/17 school year: %s" %scipy.stats.skew(gr_1617))
print("Skew for 2017/18 school year: %s" %scipy.stats.skew(gr_1718))
Since the graduation rates are both heavily skewed left, they will need to be reflected along the x-axis and log-transformed before regression. See feature engineering notebook for the transformation code. The database has a table with model ready data, which we'll work with from here on out.
query = ('''SELECT * FROM hs_modeldata WHERE log_grad_rate IS NOT NULL''');
df = pd_sql.read_sql(query, conn)
The skew is much reduced with the log-transformed graduation rates.
print("Skew for 2016/17 after log-transformation: %s"
%scipy.stats.skew(df[df.school_year=='School Year 2016-2017'].log_grad_rate))
print("Skew for 2017/18 after log-transformation: %s"
%scipy.stats.skew(df[df.school_year=='School Year 2017-2018'].log_grad_rate))
sns.distplot(df[df.school_year=='School Year 2016-2017'].log_grad_rate, color='b')
plt.title('Log-transformed and reflected\n CPS graduation rates, 2016/17')
sns.distplot(df[df.school_year=='School Year 2017-2018'].log_grad_rate, color='orange')
plt.title('Log-transformed and reflected\n CPS graduation rates, 2017/18')
hist_plt_lg = [df[df.school_year=='School Year 2016-2017'].log_grad_rate,
df[df.school_year=='School Year 2017-2018'].log_grad_rate]
color = ['blue', 'orange']
year = ['2016/17', '2017/18']
for a,c, y in zip(hist_plt_lg, color, year):
sns.distplot(a, color=c, label=y)
plt.legend()
plt.title("Log Transformed CPS\n Grad Rates 16/17 and 17/18")
Now we will take a look at the features.
demographics = ['student_count_total','student_count_asian',
'student_count_asian_pacific_islander',
'student_count_black','student_count_english_learners',
'student_count_ethnicity_not_available',
'student_count_hawaiian_pacific_islander',
'student_count_hispanic','student_count_low_income',
'student_count_multi','student_count_native_american',
'student_count_other_ethnicity',
'student_count_special_ed','student_count_white']
for dem in demographics:
hs_records_1718 = ('''SELECT SUM(%s) FROM allschools_1718
WHERE is_high_school = 'Y'
''' %dem)
cursor = conn.cursor()
cursor.execute(hs_records_1718)
print('''There are %s %s students in high schools in the 2017/18 school year'''
%(cursor.fetchall()[0][0], dem))
# hs_gr_null_1718 = ('''SELECT COUNT(school_id) FROM allschools_1718
# WHERE is_high_school = 'Y'
# AND graduation_rate_school IS NULL
# ''')
#pairplot is huge, so it is saved as an svg in figures/eda.
#sns.pairplot(df)
#plt.savefig('figures/EDA/pairplot_all.svg')
df_dem_subset = df[['log_grad_rate','student_count_total','student_count_asian_perc',
'student_count_asian_pacific_islander_perc',
'student_count_black_perc','student_count_english_learners_perc',
'student_count_ethnicity_not_available_perc',
'student_count_hawaiian_pacific_islander_perc',
'student_count_hispanic_perc','student_count_low_income_perc',
'student_count_multi_perc','student_count_native_american_perc',
'student_count_other_ethnicity_perc',
'student_count_special_ed_perc','student_count_white_perc']]
sns.pairplot(df_dem_subset, height=1.2, aspect=1.5);
scipy.stats.skew(df['log_grad_rate'])
sns.boxplot(df['log_grad_rate'])
sns.kdeplot(df['log_grad_rate'])
#Try removing outliers outside of 3 STD of the mean.
df['log_grad_rate'].mean()
df_remove_outliers = df[(np.abs(df.log_grad_rate-df.log_grad_rate.mean())
<= 3*df.log_grad_rate.std())]
df_remove_outliers.head()
sns.distplot(df_remove_outliers['log_grad_rate'])
df_remove_outliers_2std = df[(np.abs(df.log_grad_rate-df.log_grad_rate.mean())
<= 2*df.log_grad_rate.std())]
sns.distplot(df_remove_outliers_2std['log_grad_rate'])
df.corr()
# example of a better corr matrix
sns.heatmap(df.corr(), cmap="seismic", annot=False, vmin=-1, vmax=1);